require(quanteda)
require(stringi)
source("functions.R")

# Hebrew sentence corpus: tokenize, filter, and cache the tokens on disk.
local({

    sentences_he <- readRDS("data_corpus_sent_he.RDS")

    # tokenize_hebrew() is not defined in this script; presumably it is
    # provided by functions.R -- TODO confirm.
    raw_tokens <- tokenize_hebrew(sentences_he)

    # Token-length filter (min_nchar = 3). padding = TRUE is passed on both
    # removal steps, which per the quanteda docs keeps a placeholder where a
    # token was removed instead of closing the gap.
    length_filtered <- tokens_remove(raw_tokens, min_nchar = 3, padding = TRUE)

    # Remove Hebrew stopwords from the "marimo" source.
    stopword_list <- stopwords("he", "marimo")
    cleaned_tokens <- tokens_remove(length_filtered, stopword_list, padding = TRUE)

    saveRDS(cleaned_tokens, "data_tokens_he.RDS")

})


# Manually prepared Hebrew corpus: tokenize, filter, compound, and cache.
local({

    # Keep this first: `seqs` (used by tokens_compound() below) is never
    # assigned in this script, so it is presumably one of the objects restored
    # from this file -- TODO confirm.
    load("collocations_he.Rdata")

    manual_he <- readRDS("data_corpus_manual_he.RDS")
    raw_tokens <- tokens(manual_he, remove_url = TRUE)

    # Keep only tokens made up entirely of digits and letters (drops symbols).
    alnum_only <- "^[\\p{Nd}\\p{L}]+$"
    word_tokens <- tokens_select(raw_tokens, alnum_only, valuetype = "regex",
                                 case_insensitive = FALSE, padding = TRUE)

    # Remove Hebrew stopwords from the "marimo" source.
    stopword_list <- stopwords("he", "marimo")
    content_tokens <- tokens_remove(word_tokens, stopword_list, padding = TRUE)

    # Compound the sequences in `seqs`, joining the parts with a space.
    compounded <- tokens_compound(content_tokens, seqs, concatenator = " ", join = TRUE)

    saveRDS(compounded, "data_tokens_manual_he.RDS")

})

# Japanese sentence corpus: tokenize, filter, and cache the tokens on disk.
local({

    sentences_ja <- readRDS("data_corpus_sent_ja.RDS")

    # tokenize_japanese() is not defined in this script; presumably it is
    # provided by functions.R -- TODO confirm.
    raw_tokens <- tokenize_japanese(sentences_ja)

    # Remove tokens that consist of only one or two hiragana characters.
    short_hiragana <- "^[ぁ-ん]{1,2}$"
    without_short <- tokens_remove(raw_tokens, short_hiragana, valuetype = "regex",
                                   padding = TRUE)

    # Remove Japanese stopwords from the "marimo" source.
    stopword_list <- stopwords("ja", "marimo")
    cleaned_tokens <- tokens_remove(without_short, stopword_list, padding = TRUE)

    saveRDS(cleaned_tokens, "data_tokens_ja.RDS")

})

# Manually prepared Japanese corpus: tokenize, filter, compound, and cache.
local({

    # Keep this first: `seqs` (used by tokens_compound() below) is never
    # assigned in this script, so it is presumably one of the objects restored
    # from this file -- TODO confirm.
    load("collocations_ja.Rdata")

    manual_ja <- readRDS("data_corpus_manual_ja.RDS")
    raw_tokens <- tokens(manual_ja, remove_url = TRUE)

    # fix_japanese() is not defined in this script; presumably it is provided
    # by functions.R -- TODO confirm what it normalises.
    fixed_tokens <- fix_japanese(raw_tokens)

    # Keep only tokens made up entirely of full-width digits, hiragana,
    # katakana (including the long-vowel mark), and kanji in the listed range.
    japanese_only <- "^[０-９ぁ-んァ-ヶー一-龠]+$"
    word_tokens <- tokens_select(fixed_tokens, japanese_only, valuetype = "regex",
                                 padding = TRUE)

    # Remove Japanese stopwords from the "marimo" source.
    stopword_list <- stopwords("ja", "marimo")
    content_tokens <- tokens_remove(word_tokens, stopword_list, padding = TRUE)

    # Compound the sequences in `seqs`, joining the parts with no separator.
    compounded <- tokens_compound(content_tokens, seqs, concatenator = "", join = TRUE)

    saveRDS(compounded, "data_tokens_manual_ja.RDS")

})

    